1 Executive Summary
Aim of this report - to gain a clearer idea of where the collection originates:
- Butterflies - How does the collection reflect their global distribution and characteristics?
- Flies - A brief glimpse
Main discoveries:
- Butterflies are diversely distributed but most commonly found in tropical regions with warm, temperate climates
- The Morpho genus of butterflies is located exclusively in the Americas, whilst the Vanessa and Argynnis genera are more globally distributed
- The Lycaenidae family of butterflies prefers hotter climates than both the Papilionidae and Nymphalidae families, being primarily located in rainforests
2 Initial Data Analysis (IDA)
2.1 The 5 Stages of Data Preparation
- Importing the Excel spreadsheet
# Importing the data sets from Excel
entomology1_og = read_excel("ProvenanceOfEntomology.xlsx",
sheet = "By location", col_types = c("guess", "guess", "guess", "guess", "guess", "guess", "guess", "guess", "text", "guess", "guess", "guess", "guess", "guess", "guess", "guess", "guess", "guess"))
entomology2_og = read_excel("ProvenanceOfEntomology.xlsx",
sheet = "List by species COMPLETE 2017", col_types = c("guess", "guess", "guess", "guess", "guess", "guess", "guess", "guess", "guess", "text", "guess"))- Preparing the two data sets
## Wrangling 'entomology1_og' into 'entomology1'
# Removing the "LocLocationCode" and "LocLevel3" column
entomology1 = entomology1_og[, c(1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)]
# Renaming columns
names(entomology1)[c(2, 3, 14)] = c("Storage Type", "Building", "Total Number of Specimens")
# Splitting the "Building" and "Room" columns into their code and name
entomology1 = entomology1 %>%
tidyr::separate(col = Building,
into = c("Building Code", "Building Name"),
sep = "-",
fill = "left") %>%
tidyr::separate(col = Room,
into = c("Room Code", "Room Name"),
sep = "-",
fill = "left")
# Changing the NA's in the "Room Code" column to "Historic"
entomology1$`Room Code`[is.na(entomology1$`Room Code`)] = "Historic"
# Removing "Room" from the "Room Code" column, and "Cabinet" from "Cabinet" column
entomology1 = entomology1 %>%
mutate(`Room Code` = str_remove(`Room Code`, "Room")) %>%
mutate(Cabinet = str_remove(Cabinet, "Cabinet"))
# Renaming columns again
names(entomology1)[c(7, 8)] = c("Cabinet Code", "Drawer Number")
# Changing the variable names into 'camel case'
entomology1 = janitor::clean_names(entomology1)
# Removing the variable "type_specimen" from 'entomology1'
entomology1 = select(entomology1, -type_specimen)
# Removing duplicate rows
entomology1[!duplicated(entomology1), ]
## Wrangling 'entomology2_og' into 'entomology2'
# Removing the blank column
entomology2 = entomology2_og[, c(1, 2, 3, 5, 6, 7, 8, 9, 10, 11)]
# Renaming columns
names(entomology2)[c(1, 2, 3, 6)] = c("Room Code", "Cabinet Code",
"Drawer Number",
"Number of Specimens")
# Replacing "[empty]", "`", "[not identified]" and "[on display]" with NA's
entomology2[entomology2 == "[empty]"] = NA
entomology2[entomology2 == "`"] = NA
entomology2[entomology2 == "[not identified]"] = NA
entomology2[entomology2 == "[on display]"] = NA
# Changing the variable names into 'camel case'
entomology2 = janitor::clean_names(entomology2)- Merging into one
## Merging the two data frames into 'entomology'
# Changing the class of 'entomology1' and 'entomology2' to data frame
entomology1 = data.frame(entomology1)
entomology2 = data.frame(entomology2)
# Copying the "room_code" and "cabinet_code" variables to new columns
entomology2 = entomology2 %>%
mutate(room_code_copy = room_code) %>%
mutate(cabinet_code_copy = cabinet_code)
# Changing the "room_code" and "cabinet_code" variables from character to numeric types
entomology1[, 5] = as.numeric(as.factor(entomology1[, 5]))
entomology2[, 1] = as.numeric(as.factor(entomology2[, 1]))
entomology1[, 7] = as.numeric(as.factor(entomology1[, 7]))
entomology2[, 2] = as.numeric(as.factor(entomology2[, 2]))
# Merging!!
entomology = left_join(entomology2, entomology1, by = c("room_code", "cabinet_code", "drawer_number"))
# Re-arranging the columns so the actual "room_code" and "cabinet_code" is preserved
entomology = entomology[, c(4, 5, 7, 19, 8, 20, 21, 22, 18, 6, 24, 23, 14, 15, 16, 11, 17, 12, 3, 26, 25, 9, 10, 13)]
# Replacing NA key words with NA's
entomology[, 1:19][entomology[, 1:19] == "Empty"] = NA
entomology[, 21:24][entomology[, 21:24] == "Empty"] = NA
entomology[, 1:19][entomology[, 1:19] == "[on display]"] = NA
entomology[, 21:24][entomology[, 21:24] == "[on display]"] = NA
entomology$locality[entomology$locality == "[no locality]"] = NA
entomology$locality[entomology$locality == "[unknown locality]"] = NA
# Moving a specimen to another cabinet
entomology[18][entomology[1] == "Belongs in 10B drawer 7 [not identified]"] = "10B"
entomology[1][entomology[1] == "Belongs in 10B drawer 7 [not identified]"] = NA
# Accounting for NA's in "building_name" and "room_name"
entomology$building_name[entomology$room_code_copy == "Historic"] = "Elizabeth Bay House"
entomology$room_name[entomology$room_code_copy == "Historic"] = "Library"
entomology$building_name[entomology$room_code_copy == "111"] = "Badham Building"
entomology$room_name[entomology$room_code_copy == "111"] = "Macleay Entomology store"
entomology$building_code[entomology$room_code_copy == "111"] = "A16"
# Changing NA's in "number_of_specimens" and "total_number_of_specimens" to 0
entomology$number_of_specimens[is.na(entomology$number_of_specimens) == TRUE] = 0
entomology$total_number_of_specimens[is.na(entomology$total_number_of_specimens) == TRUE] = 0
# Changing the variables to their appropriate type
entomology$number_of_specimens = as.numeric(as.character(entomology$number_of_specimens))
# Removing an inconsistent row
entomology = subset(entomology, number_of_specimens != 89225)
# Changing values containing "?", "[illegible]" and "[Historic drawer]" to NA and "Rob Blackburn" to "Robert Blackburn"
entomology = entomology %>%
mutate_if(is.character, function(x) {
gsub("\\?", NA, x)
})
entomology = entomology %>%
mutate_if(is.character, function(x) {
gsub("\\[illegible]", NA, x)
})
entomology = entomology %>%
mutate_if(is.character, function(x) {
gsub("\\[empty]", NA, x)
})
entomology = entomology %>%
mutate_if(is.character, function(x) {
gsub("\\[Historic drawer]", NA, x)
})
entomology = entomology %>%
mutate_if(is.character, function(x) {
gsub("Rob Blackburn", "Robert Blackburn", x)
})
# Changing the column names
# "order.x" --> "order_indiv",
# "order.y" --> "order_drawer",
# "family.x" --> "family_indiv",
# "family.y" --> "family_drawer",
# "sub_family" --> "sub_family_drawer",
# "i_d" --> "id",
# "number_of_specimens" --> "num_indiv_specimens",
# "total_number_of_specimens" --> "num_drawer_specimens",
# "room_code_copy" --> "room_code",
# "cabinet_code_copy" --> "cabinet_code"
names(entomology)[c(3, 4, 5, 6, 7, 8, 10, 11, 16, 18)] = c("order_indiv", "order_drawer", "family_indiv", "family_drawer", "sub_family_drawer", "id", "num_indiv_specimens", "num_drawer_specimens", "room_code", "cabinet_code")- Inserting taxonomic & location API data
## Merging the given 'taxonomyData' with 'entomology' to add the GBIF API data
# Importing 'taxonomyData' as 'taxonomy'
taxonomy = readr::read_csv("taxonomyData.csv")
# Changing the "user_supplied_name" and "name_in_label" variables from factor to character types
taxonomy$user_supplied_name = as.character(taxonomy$user_supplied_name)
entomology$name_in_label = as.character(entomology$name_in_label)
# Merging...
entomology = left_join(entomology, taxonomy, by = c("name_in_label" = "user_supplied_name"))
# Removing the column "submitted_name" and "num_total_specimens"
entomology = select(entomology, -submitted_name)
# Re-arranging the column order
entomology = entomology[, c(1, 25, 2, 26:28, 3:4, 29, 5:7, 30:32, 8:24)]
## Merging the given 'locationData' with 'entomology' to add the Google Maps API data
# Importing 'locationData' as 'location'
location = readr::read_csv("locationData.csv")
# Changing the "uniqueLocation" and "locality" variables from factor to character types
location$uniqueLocation = as.character(location$uniqueLocation)
entomology$locality = as.character(entomology$locality)
# Merging...
entomology = left_join(entomology, location, by = c("locality" = "uniqueLocation"))
# Re-arranging columns
entomology = entomology[, c(1:3, 37, 33:36, 38:41, 4:32)]
# Removing the "irn" variable
entomology = select(entomology, -irn)
# Changing the variable types from factor to character if the variable is already assigned as a factor
entomology = entomology %>%
mutate_if(is.factor, as.character)- Extracting useful location data from the address column
## Extracting a "country_region" column from the "address" column
# Reversing the address around its commas since each country is generally positioned at the end
reversed_address = sapply(lapply(str_split(entomology$address, ","), rev), paste)
# Extracting each country from 'reversed_address'
countries = purrr::map_chr(reversed_address, 1)
# Trimming whitespaces
countries = trimws(countries, "both")
# Cleaning entries with numbers preceding the country
countries_cleaned = str_match(countries, "^\\d{4}\\s(\\w+)$")
unique(countries_cleaned) # <-- only "hungary" appears to have this number pattern preceding it
countries_cleaned = lapply(countries, gsub, pattern = "^\\d{4}\\s(\\w+)$", replacement= "hungary")
# Now looking at the rest of the entries, there are a lot of continents, which is a little too broad for our liking
check = unique(countries_cleaned)
# We edited a version of 'check' which we exported as a csv - this was to create a key value pair for replacement of the values
# Importing the csv as 'replacement'
replacement = read.csv("replacement.csv")
# Creating a vectorised replacement
vector_replacement <- with(replacement, setNames(replace, original))
# Clean the countries column using the vector
country_region = countries_cleaned
for (item in replacement$original) {
country_region = sapply(country_region, gsub, pattern = item, replacement = as.character(vector_replacement[item]))
}
# Binding "country_region" to 'entomology' as a new column
entomology = cbind(entomology, country_region)
# Moving "country_region" to the 4th index
entomology = entomology[, c(1:3, 41, 4:40)]
# Replacing "NA" with NA in the "country_region" column
entomology$country_region[entomology$country_region == "NA"] = NA
# Changing the variable type of the "country_region" column from factor to character
entomology$country_region = as.character(entomology$country_region)2.2 A Glimpse of the Data Set
# Quick look at 5 rows in the data set
kable(entomology[928:933, ], "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width = "100%")| name_in_label | classification_name | locality | country_region | address | lon | lat | type | loctype | north | south | east | west | kingdom | phylum | class | order_indiv | order_drawer | order | family_indiv | family_drawer | sub_family_drawer | family | genus | species | id | drawer_details | num_indiv_specimens | num_drawer_specimens | notes | storage_type | building_code | building_name | room_code | room_name | cabinet_code | drawer_number | date_recorded | recorded_by | synonymy_l_gray_2017 | synonymy_r_blackburn_2017 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 928 | Acalles alboguttatus Chevrolat | Acalles albovittatus Fiedler, 1940 | Mexico | mexico | mexico | -102.55278 | 23.63450 | country | approximate | 32.71865 | 14.38950 | -86.5887 | -118.65230 | Animalia | Arthropoda | Insecta | Coleoptera | Coleoptera | Coleoptera | Curculionidae | NA | NA | Curculionidae | Acalles | Acalles albovittatus | NA | NA | 2 | 152 | NA | Storage | A16 | Badham Building | 111 | Macleay Entomology store | 79 | 1 | NA | Robert Blackburn | NA | NA |
| 929 | Acalles camelus Fabricius | Acalles camelus Dejean, 1836 | Vienna, Austria | austria | vienna, austria | 16.37382 | 48.20817 | locality | approximate | 48.32310 | 48.11827 | 16.5775 | 16.18262 | Animalia | Arthropoda | Insecta | Coleoptera | Coleoptera | Coleoptera | Curculionidae | NA | NA | Curculionidae | Acalles | Acalles camelus | NA | NA | 1 | 152 | NA | Storage | A16 | Badham Building | 111 | Macleay Entomology store | 79 | 1 | NA | Robert Blackburn | NA | NA |
| 930 | Acalles erroneus Pascoe | Acalles erroneus Pasc., 1876 | Tairua, New Zealand | new zealand | tairua, new zealand | 175.84973 | -36.99606 | locality | approximate | -36.98561 | -37.05148 | 175.8724 | 175.81802 | Animalia | Arthropoda | Insecta | Coleoptera | Coleoptera | Coleoptera | Curculionidae | NA | NA | Curculionidae | Acalles | Acalles erroneus | NA | NA | 1 | 152 | NA | Storage | A16 | Badham Building | 111 | Macleay Entomology store | 79 | 1 | NA | Robert Blackburn | NA | NA |
| 931 | Acalles histriculus Pascoe | Acalles hystriculus Pasc., 1876 | Tairua, New Zealand | new zealand | tairua, new zealand | 175.84973 | -36.99606 | locality | approximate | -36.98561 | -37.05148 | 175.8724 | 175.81802 | Animalia | Arthropoda | Insecta | Coleoptera | Coleoptera | Coleoptera | Curculionidae | NA | NA | Curculionidae | Acalles | Acalles hystriculus | NA | NA | 1 | 152 | NA | Storage | A16 | Badham Building | 111 | Macleay Entomology store | 79 | 1 | NA | Robert Blackburn | NA | NA |
| 932 | Acalles hypocritus Boheman | Acalles hypocrita Dejean | Vienna, Austria | austria | vienna, austria | 16.37382 | 48.20817 | locality | approximate | 48.32310 | 48.11827 | 16.5775 | 16.18262 | Animalia | Arthropoda | Insecta | Coleoptera | Coleoptera | Coleoptera | Curculionidae | NA | NA | Curculionidae | Acalles | Acalles hypocrita | NA | NA | 1 | 152 | NA | Storage | A16 | Badham Building | 111 | Macleay Entomology store | 79 | 1 | NA | Robert Blackburn | NA | NA |
| 933 | Acalles perditus Pasc | Acalles perditus Pascoe, 1874 | New South Wales | australia | new south wales, australia | 146.92110 | -31.25322 | administrative_area_level_1 | approximate | -28.15702 | -37.50528 | 159.1054 | 140.99928 | Animalia | Arthropoda | Insecta | Coleoptera | Coleoptera | Coleoptera | Curculionidae | NA | NA | Curculionidae | Acalles | Acalles perditus | NA | NA | 1 | 152 | NA | Storage | A16 | Badham Building | 111 | Macleay Entomology store | 79 | 1 | NA | Robert Blackburn | NA | Decilaus perditus (Pascoe, 1874) |
# Size of the data and R's classification of the variables
str(entomology)## 'data.frame': 35602 obs. of 41 variables:
## $ name_in_label : chr NA NA NA NA ...
## $ classification_name : chr NA NA NA NA ...
## $ locality : chr NA NA NA NA ...
## $ country_region : chr NA NA NA NA ...
## $ address : chr NA NA NA NA ...
## $ lon : num NA NA NA NA NA NA NA NA NA NA ...
## $ lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ type : chr NA NA NA NA ...
## $ loctype : chr NA NA NA NA ...
## $ north : num NA NA NA NA NA NA NA NA NA NA ...
## $ south : num NA NA NA NA NA NA NA NA NA NA ...
## $ east : num NA NA NA NA NA NA NA NA NA NA ...
## $ west : num NA NA NA NA NA NA NA NA NA NA ...
## $ kingdom : chr NA NA NA NA ...
## $ phylum : chr NA NA NA NA ...
## $ class : chr NA NA NA NA ...
## $ order_indiv : chr NA NA NA NA ...
## $ order_drawer : chr NA "Coleoptera" "Coleoptera" NA ...
## $ order : chr NA NA NA NA ...
## $ family_indiv : chr NA NA NA NA ...
## $ family_drawer : chr NA NA "Chrysomelidae" NA ...
## $ sub_family_drawer : chr NA NA NA NA ...
## $ family : chr NA NA NA NA ...
## $ genus : chr NA NA NA NA ...
## $ species : chr NA NA NA NA ...
## $ id : chr NA NA NA NA ...
## $ drawer_details : chr NA NA NA NA ...
## $ num_indiv_specimens : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num_drawer_specimens : num 0 62 195 0 0 0 0 0 0 0 ...
## $ notes : chr NA "Pretty drawer" NA "Sample of W.S. Macleay's handwriting" ...
## $ storage_type : chr "Storage" "Storage" "Storage" "Storage" ...
## $ building_code : chr "A16" "A16" "A16" "A16" ...
## $ building_name : chr "Badham Building" "Badham Building" "Badham Building" "Badham Building" ...
## $ room_code : chr "111" "111" "111" "111" ...
## $ room_name : chr "Macleay Entomology store" "Macleay Entomology store" "Macleay Entomology store" "Macleay Entomology store" ...
## $ cabinet_code : chr "21" "23" "26" "29" ...
## $ drawer_number : num 12 2 9 12 13 14 8 9 10 11 ...
## $ date_recorded : POSIXct, format: "2013-05-02" "2010-07-05" ...
## $ recorded_by : chr "Robert Blackburn" "Robert Blackburn" "Robert Blackburn" "Robert Blackburn" ...
## $ synonymy_l_gray_2017 : chr NA NA NA NA ...
## $ synonymy_r_blackburn_2017: chr NA NA NA NA ...
2.3 Source of the Data
The data was obtained from Sydney University’s Macleay Collection.
Each row represents an entomological collection event while each column represents its characteristics.
2.4 Initial Questions About the Data
1. How large is the data set?
# Looking at the dimensions of the data
dim(entomology)## [1] 35602 41
The data set has 41 columns (variables) and 35 602 rows (entries).
2. What countries/regions do the specimens originate from? How many different countries/regions are there?
# Finding the names of the first and last 5 unique locations, sorted in alphabetical order
head(sort(unique(entomology$country_region)), n = 5)## [1] "algeria" "alps" "amazon river" "amur river"
## [5] "angola"
tail(sort(unique(entomology$country_region)), n = 5)## [1] "usa" "vanuatu" "venezuela" "west indies" "zambia"
# Finding how many locations there are
nrow(table(unique(entomology$country_region)))## [1] 125
From above, 125 countries out of 197 have been recorded ranging from Algeria to Zambia.
3. What is the phylogenetic structure of the collection?
# Creating a data set 'entomology_narm_phylo' without NA values and the false kingdom "incertae sedis"
entomology_narm_phylo = entomology %>%
drop_na(kingdom) %>%
drop_na(phylum) %>%
drop_na(class)
entomology_narm_phylo = subset(entomology_narm_phylo, kingdom != "incertae sedis")
# Creating a phylogenetic path for each specimen
entomology_narm_phylo$pathString = paste("Life",
entomology_narm_phylo$kingdom,
entomology_narm_phylo$phylum,
entomology_narm_phylo$class,
sep = "/")
# Creating a data set 'entomology_narm_phylo_deeper' that goes into the family level
entomology_narm_phylo_deeper = entomology_narm_phylo %>%
drop_na(order) %>%
drop_na(family)
# Creating a data set 'family_summary' with the number of specimens for each order
family_summary = entomology_narm_phylo_deeper %>%
group_by(family) %>%
summarise(num_family = sum(num_indiv_specimens))
# Adding the "num_family" column to 'entomology_narm_phylo_deeper'
entomology_narm_phylo_deeper =
left_join(entomology_narm_phylo_deeper, family_summary)
# Creating a phylogenetic path for each specimen
entomology_narm_phylo_deeper$pathString = paste("Life",
entomology_narm_phylo_deeper$kingdom,
entomology_narm_phylo_deeper$phylum,
entomology_narm_phylo_deeper$class,
entomology_narm_phylo_deeper$order,
entomology_narm_phylo_deeper$family,
sep = "/")
# Creating the 'life_family' data tree structure
life_family = data.tree::as.Node(entomology_narm_phylo_deeper)
# Circle packing of the phylogenetic path to family, coloured by depth
family_pack = circlepackeR(life_family, size = "num_family", color_min = "RGB(0,115,255)", color_max = "RGB(255,0,124)")
# Saving 'family_pack' as an html widget
htmlwidgets::saveWidget(as_widget(family_pack), file = "phylowidget.html")
/>
So it is clear from above that the largest number of specimens in the collection follow the following phylogenetic path:
- Kingdom: Animalia
- Phylum: Anthropoda
- Class: Insecta
- Order: Coleoptera
- Family: Curculionidae
This corresponds to one of the largest animal families in taxonomy. Namely plant feeding beetles:
This phylogenetic structure is further evident in the radial tree network below:
# Creating the 'life_class' data tree structure
life_class = data.tree::as.Node(entomology_narm_phylo)
# Creating a list structure of 'life'
life_class_list = ToListExplicit(life_class, unname = TRUE)
## Making custom colour arrays
# Node circumference
colorVector_stroke = c(rep("red", 1), rep("black", 6), rep("red", 28), rep("black", 60))
jsarray_stroke = paste0('["', paste(colorVector_stroke, collapse = '", "'), '"]')
nodeStrokeJS = JS(paste0('function(d, i) { return ', jsarray_stroke, '[i]; }'))
# Node fill
colorVector_colour = c(rep("white", 1), rep("RGBA(0,0,0,0.59)", 6), rep("RGBA(197,0,0,0.62)", 28), rep("RGBA(0,5,253,0.53)", 60))
jsarray_colour = paste0('["', paste(colorVector_colour, collapse = '", "'), '"]')
nodeColourJS = JS(paste0('function(d, i) { return ', jsarray_colour, '[i]; }'))
# Text
colorVector_text = c(rep("red", 1), rep("black", 6), rep("#ccc", 28), rep("blue", 60))
jsarray_text = paste0('["', paste(colorVector_text, collapse = '", "'), '"]')
textColourJS = JS(paste0('function(d, i) { return ', jsarray_text, '[i]; }'))
# Trace lines
colorVector_link = c(rep("#25D80E", 1), rep(
"RGBA(0,0,0,0.59)", 5), rep("#25D80E", 1), rep(
"RGBA(0,0,0,0.59)", 27), rep("#25D80E", 1), rep(
"RGBA(0,0,0,0.59)", 60))
jsarray_link = paste0('["', paste(colorVector_link, collapse = '", "'), '"]')
linkColourJS = JS(paste0('function(d, i) { return ', jsarray_link, '[i]; }'))
# Radial phylogenetic node network up to class
radialNetwork(life_class_list,
linkColour = linkColourJS,
nodeColour = nodeColourJS,
nodeStroke = nodeStrokeJS,
textColour = textColourJS,
fontSize = 15)Above, the most common path in the network, towards the Insecta class, is highlighted in green.
4. What is the geographical distribution of the collection?
# Creating a subset 'entomology_narm_spatial' without NA values and the false kingdom "incertae sedis"
entomology_narm_spatial = entomology %>%
drop_na(kingdom) %>%
drop_na(lon) %>%
drop_na(lat)
entomology_narm_spatial = subset(entomology_narm_spatial, kingdom != "incertae sedis")
# Creating a new column "kingdom_factor" as a factor type of the kingdom vector
entomology_narm_spatial$kingdom_factor = factor(entomology_narm_spatial$kingdom)
# Creating a layer of the world map
mapWorld = borders("world", colour = "#00AFD9", fill = "white")
# Animated spatial plot of the distribution of specimens with different kingdoms
animated_kingdoms =
ggplot(entomology_narm_spatial,
aes(x = entomology_narm_spatial$lon,
y = entomology_narm_spatial$lat,
colour = kingdom_factor)) +
mapWorld + geom_point(size = 1.5) +
xlab("") + ylab("") +
theme(panel.border = element_blank(),
panel.grid = element_blank(),
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "#DFF7F1"),
axis.ticks = element_blank(),
axis.text = element_blank(),
plot.caption = element_text(hjust = 0.5, colour = "black", size = rel(1.6))) +
labs(colour = "Kingdoms\n", caption = "The Global Distribution of Specimens") +
transition_manual(kingdom_factor)
# Printing it to standard output
print(animated_kingdoms)
# Converting it manually to .png and embedding below
# Making a new colour palette
colourPalette = colfunc = colorRampPalette(c("lightsteelblue", "midnightblue"))
colourPalette = colfunc(6751)
# Creating a frequency table
freq = as.data.frame(table(entomology$country_region))
# Creating the data sets 'countries' and 'insectMap'
countries = data.frame(country = freq$Var1,
DistributionOfInsects = freq$Freq)
insectMap = joinCountryData2Map(countries, joinCode = "NAME",
nameJoinColumn = "country")## 99 codes from your data successfully matched countries in the map
## 26 codes from your data failed to match with a country code in the map
## 145 codes from the map weren't represented in your data
# Create a world shaped window
mapDevice('x11')
# Plotting chloropleth
mapCountryData(insectMap, nameColumnToPlot = "DistributionOfInsects", catMethod = "fixedWidth", numCats = 151, missingCountryCol = "white", colourPalette = colourPalette, borderCol = "midnightblue")From the animated graph above it is clear that the collection has a strong diversity of locations. In particular:
- Australia
- Europe
- and North America
are the most geographically populated regions of the collection.
2.5 Possible Issues with the Data
- Spelling errors and missing values populate the data set, resulting in incorrectly mapped GPS coordinates whilst complicating our the final conclusions - we explicitly target this aspect of the data throughout the contents of this report
table(is.na(entomology))##
## FALSE TRUE
## 969733 489949
489 949 out of 1 459 682 to be precise
- The age of the recorded data has a bearing on its accuracy, as the classification of specimens and localities have shifted dramatically due to taxonomic developments and colonisation since the collection beginnings in the 1760s
- The sample size of the data set is not large enough to make reliable conclusions about the population of entomological specimens around the world
2.6 Domain Knowledge
Taxonomy is the scientific classification of things. Classification is done by organising biological organisms into categories based on similar characteristics. These groups are then ranked into what is known as the taxonomic hierarchy which is represented in the image below:
Caption: The Taxonomic Heirarchy
One example of a phylogenetic path along the taxonomic hierarchy includes that of a common sunflower:
- Kingdom: Plantae
- Phylum: Angiosperms
- Class: Eudicots
- Order: Asterales
- Family: Asteraceae
- Genus: Bidens
- Species: Bidens torta
Butterflies lie within the Lepidoptera order of the Insecta class and are the focus of our project.
The most common families in descending order include: Nymphalidae, Papilionidae and Lycaenidae. Each of these have their own distinct characteristics and geographical distributions. For example the Nymphalidae family, most commonly referred to as the Brush-footed butterflies, are particularly located in the tropics and often exhibit dull colours on their wings such as browns, oranges, yellows and blacks.
Caption: Limenitis camilla of the Nymphalidae family
Within the Nymphalidae family the most common genera are Morpho, Vanessa, and Argynnis; again, each with their own distinct features. For example, butterflies of the Morpho genus are typically coloured in metallic, shimmering shades of blues and greens and inhabit forests of the Amazon and Atlantic.
3 Research Questions
3.1 Butterflies - How does the collection reflect their global distribution and characteristics?
Let’s examine the phylogenetic structure of the butterfly. We can do this by looking at the Lepidoptera order since it encompasses butterflies.
# Creating a data set 'entomology_narm_phylo_even_deeper' that goes to into the family level
entomology_narm_phylo_even_deeper = entomology_narm_phylo %>%
drop_na(order) %>%
drop_na(family) %>%
drop_na(genus)
# Creating a data set 'genus_summary' with the number of specimens for each order
genus_summary = entomology_narm_phylo_even_deeper %>%
group_by(genus) %>%
summarise(num_genus = sum(num_indiv_specimens))
# Subsetting the data for the Lepidoptera order
entomology_narm_phylo_even_deeper = subset(entomology_narm_phylo_even_deeper, order == "Lepidoptera")
# Adding the "num_genus" column to 'entomology_narm_phylo_even_deeper'
entomology_narm_phylo_even_deeper =
left_join(entomology_narm_phylo_even_deeper, genus_summary)
# Creating a phylogenetic path for each butterfly
entomology_narm_phylo_even_deeper$pathString = paste("Life", entomology_narm_phylo_even_deeper$order,
entomology_narm_phylo_even_deeper$family,
entomology_narm_phylo_even_deeper$genus,
sep = "/")
# Creating the 'life_genus_lep' data tree structure
life_genus_lep = data.tree::as.Node(entomology_narm_phylo_even_deeper)
# Circle packing of the phylogenetic path to genus coloured by depth
genus_pack_lep = circlepackeR(life_genus_lep, size = "num_genus", color_min = "RGB(255,0,124)", color_max = "RGB(0,115,255)")
# Saving 'genus_pack_lep' as an html widget
htmlwidgets::saveWidget(as_widget(genus_pack_lep), file = "phylowidget_lep.html")It is clear from above that the largest butterfly families in the collection are:
- Nymphalidae
- Papilionidae
- Lycaenidae
in descending order.
But what are these families’ geographical distributions respectively? Are there distinct differences in location and concentration?
# Creating a subset 'entomology_narm_spatial_bf' without NA values and the false kingdom "incertae sedis"
entomology_narm_spatial_bf = entomology %>%
drop_na(family) %>%
drop_na(lon) %>%
drop_na(lat)
# Creating a subset 'entomology_narm_spatial' without NA values and the false kingdom "incertae sedis"
entomology_narm_spatial_bf = subset(entomology_narm_spatial_bf, family == "Nymphalidae" | family == "Papilionidae" | family == "Lycaenidae")
# Creating a new column "family_factor" as a factor type of the family vector
entomology_narm_spatial_bf$family_factor = factor(entomology_narm_spatial_bf$family)
# Adding Capricorn and Cancer tropic lines
cancer = data.frame(y = 23.5, cancer = factor(23.5))
capricorn = data.frame(x = c(-Inf, Inf), y = -23.5, capricorn = factor(-23.5))
# Animated spatial plot of the distribution of butterfly families
animated_family_bf =
ggplot(entomology_narm_spatial_bf,
aes(x = entomology_narm_spatial_bf$lon,
y = entomology_narm_spatial_bf$lat,
colour = entomology_narm_spatial_bf$family_factor)) +
mapWorld + geom_point(size = 2) +
xlab("") + ylab("") +
theme(panel.border = element_blank(),
panel.grid = element_blank(),
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "#DFF7F1"),
axis.ticks = element_blank(),
axis.text = element_blank(),
plot.caption = element_text(hjust = 0.5, colour = "black", size = rel(1.6))) +
transition_manual(entomology_narm_spatial_bf$family_factor) +
labs(colour = "Families\n", caption = "Largest Bufferfly Families")
# Add tropical lines of Cancer and Capricorn
animated_family_bf = animated_family_bf +
geom_hline(aes(yintercept = y, linetype = cancer), data = cancer, show.legend = F) +
geom_hline(aes(yintercept = y, linetype = capricorn), data = capricorn, show.legend = F)
# Printing it to standard output
print(animated_family_bf)
# Converting it manually to 'Bf_family.png' and embedding belowAs seen in the animated graph above, the Nymphalidae family are distributed worldwide but are especially rich in the tropics, as seen in the region between the two horizontal lines above. Butterflies of this family often migrate from northern latitudes such as Canada down to tropical latitudes such as Mexico. This migration path is also evident in the animated graph.
The Papilionidae family are also distributed worldwide, with a higher concentration in the tropics as seen above. Butterflies of this family are commonly referred to as the Swallowtail butterflies because they are known for the tail-like extensions of their hind wings. FUN FACT: Their colour patterns are very diverse, ranging from yellows and oranges, to greens and blues.
The Lycaenidae family is similarly distributed in the graph above but with the least concentration. The Lycaenidae are commonly referred to as the Gossamer-winged butterflies. This family is traditionally subdivided into four categories; the blues, the coppers, the hair streaks and the harvesters.
- Although our data unfortunately doesn’t distinguish between these four sub-families, the blues are richest in the Old-World tropics (Africa, Asia and Europe) and north temperate zones, the coppers are especially dominant in north temperate regions, whilst the hair streaks are particularly abundant in New-World tropics (Oceania and Americas). FUN FACT: Their wings are usually brightly coloured and often have false-eye markings which serve as decoys for predators.
Elaborating forth, let’s examine the distribution of Nymphalidae butterflies within its three largest genera: Vanessa, Argynnis and Morpho.
# Creating a subset 'entomology_narm_spatial_bf_genus' without NA values
entomology_narm_spatial_bf_genus = entomology %>%
drop_na(genus) %>%
drop_na(lon) %>%
drop_na(lat)
# Creating a subset 'entomology_narm_spatial' without NA values and the false kingdom "incertae sedis"
entomology_narm_spatial_bf_genus = subset(entomology_narm_spatial_bf_genus, genus == "Morpho" | genus == "Vanessa" | genus == "Argynnis")
# Creating a new column "genus_factor" as a factor type of the genus vector
entomology_narm_spatial_bf_genus$genus_factor = factor(entomology_narm_spatial_bf_genus$genus)
# Animated spatial plot of the distribution of butterfly families
animated_genus_bf =
ggplot(entomology_narm_spatial_bf_genus,
aes(x = entomology_narm_spatial_bf_genus$lon,
y = entomology_narm_spatial_bf_genus$lat,
colour = genus_factor)) +
mapWorld + geom_point(size = 2) +
xlab("") + ylab("") +
theme(panel.border = element_blank(),
panel.grid = element_blank(),
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "#DFF7F1"),
axis.ticks = element_blank(),
axis.text = element_blank(),
plot.caption = element_text(hjust = 0.3, colour = "black", size = rel(1.6))) +
labs(colour = "Genera\n", caption = "Largest Bufferfly Genera of the Nymphalidae Family") +
transition_manual(genus_factor)
# Printing it to standard output
print(animated_genus_bf)
# Converting it manually to 'Bf_genus.png' and embedding below
colourPalette = colfunc <- colorRampPalette(c("lavenderblush", "deeppink"))
colourPalette = colfunc(151)
freq = as.data.frame(table(entomology_narm_spatial_bf$country_region))
countries <- data.frame(country = freq$Var1,
DistributionOfNymphalidae = freq$Freq)
nymphMap <- joinCountryData2Map(countries, joinCode = "NAME",
nameJoinColumn = "country")## 52 codes from your data successfully matched countries in the map
## 2 codes from your data failed to match with a country code in the map
## 191 codes from the map weren't represented in your data
mapDevice('x11')
mapCountryData(nymphMap, nameColumnToPlot="DistributionOfNymphalidae", catMethod = "fixedWidth", numCats = 151,
missingCountryCol = "white", colourPalette=colourPalette, borderCol = "maroon4")As seen in the animated graph and chloropleth above, butterflies of the Morpho genus appear to primarily inhabit forests of the Amazon and Atlantic. These are typically coloured in metallic, shimmering shades of blues and greens.
The Vanessa genus are distributed globally, appearing more often in the northern hemisphere. These butterflies cover a broad spectrum in terms of colouration. FUN FACT: They are named after the character Vanessa in Jonathan Swift’s poem, “Cadenus and Vanessa”, who is referred to as a “nymph” eleven times.
The Argynnis genus appears to be highly concentrated in the Americas despite being most commonly found in Europe and Asia. This most likely reflects the unreliability of the data (in terms of sufficiency and completeness), or perhaps a new insight into the location of these butterflies. FUN FACT: Argynnis butterflies are commonly known as the fritillaries.
Now let’s discuss the common habitat temperature for butterflies in the collection. What is it’s spread and does this align with our domain knowledge?
## Merging 'entomology' with new temperature data from the OpenWeatherMap API
# Made a subset csv first to go into python
ent_spatial = entomology_narm_spatial_bf[, c(1:7)]
write_csv(ent_spatial, "ent_spatial.csv")# Writing a python script to receive temperature data
run = "false" # So it doesn't run in RStudio
if run == "true":
# Importing an http request library and csv read/writer
import requests
import csv
# Global empties
latitude = []
longitude = []
# Opening the csv file of latitude and longtitude data
with open("ent_spatial.csv") as f:
ent_spatial = csv_reader = csv.reader(f, delimiter=",")
for row in ent_spatial:
longitude.append(row[5])
latitude.append(row[6])
f.close()
# Removing the labels from the csv -> they won't parse
latitude.pop(0)
longitude.pop(0)
api_key = "b944d73972b0d4a0363d3d74fdd07487"
i = 0
length = len(latitude)
temps = []
# Requesting data for each lat/long pair
for i in range(length):
url = "http://api.openweathermap.org/data/2.5/weather?lat={0}&lon={1}&appid={2}".format(latitude[i], longitude[i], api_key)
r = requests.get(url)
# Data comes back as dictionaries
data = r.json()
temps.append(data['main']['temp'])
# Progress monitor
print("{}/{}".format(i,length))
print(temps)
temps.insert(0, "temp")
# Writing the data to the 'temp3.csv' file
with open ("temps3.csv", "w") as f:
writer = csv.writer(f)
writer.writerow(f)
f.close()# Importing the data set
kel_temp = read.csv("temps3.csv")
kel_temp = setNames(kel_temp, c("temperature"))
# Converting the temperature data from Kelvin to Celcius
cel_temp = kelvin.to.celsius(kel_temp, 2)
# Appending 'cel_temp' to 'entomology_narm_spatial_bf'
entomology_narm_spatial_bf = cbind(entomology_narm_spatial_bf, cel_temp)# Box plot of the habitat temperature distribution of the largest butterfly families
plot_ly(x = ~entomology_narm_spatial_bf$temperature, type = "box", color = ~entomology_narm_spatial_bf$family_factor) %>%
layout(title = "Habitat Temperature Distribution for the Largest Butterfly Families", xaxis = list(title = "Avg Habitat Temperature (degrees Celcius)"))From the box plot above, it is clear that the Lycaenidae family prefers hotter climates than both the Papilionidae and Nymphalidae families which share similar trends. The median habitat temperature for the Lycaenidae family is 26.95 degrees Celsius, 6 degrees Celsius more than the other two families which share a common median of 20.95 degrees Celsius.
The spread of the Lycaenidae family is also much greater, with an IQR of 20°C compared to 14°C for the other two.
This reflects their true habitat, being most commonly found in rainforest climates.
3.2 Flies - A brief glimpse
Similar to our previous examination, but instead now let’s take a brief glimpse at the distribution of flies.
# Making a new icon and assigning it to the image of a fly
url_fly = "https://i2.wp.com/freepngimages.com/wp-content/uploads/2014/04/fly_2.png?fit=220%2C220"
fly = leaflet::makeIcon(url_fly, url_fly, 20, 20)
# Spatial plot of the distribution of flies (order Diptera)
leaflet(entomology[entomology$order == "Diptera", ], options = leafletOptions(minZoom = 1, maxZoom = 18, worldCopyJump = TRUE)) %>%
addProviderTiles("OpenTopoMap") %>%
addMarkers(lng = ~lon, lat = ~lat,
icon = fly,
popup = ~paste0(classification_name,
"<br/>Family: ", family,
"<br/>Location: ", locality),
label = ~family)4 Session Info
sessionInfo()## R version 3.5.2 (2018-12-20)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Australia.1252 LC_CTYPE=English_Australia.1252
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Australia.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] reticulate_1.12 dichromat_2.0-0
## [3] rworldmap_1.3-6 sp_1.3-1
## [5] weathermetrics_1.2.2 magick_2.0
## [7] gapminder_0.3.0 gifski_0.8.6
## [9] gganimate_1.0.3 htmlwidgets_1.3
## [11] circlepackeR_0.0.0.9000 viridis_0.5.1
## [13] viridisLite_0.3.0 igraph_1.2.4
## [15] ggraph_1.0.2 networkD3_0.4
## [17] treemap_2.4-2 data.tree_0.7.8
## [19] plotly_4.9.0 leaflet_2.0.2
## [21] mapview_2.6.3 prettydoc_0.2.1
## [23] maps_3.3.0 skimr_1.0.5
## [25] janitor_1.1.1 forcats_0.4.0
## [27] dplyr_0.8.0.1 purrr_0.3.2
## [29] readr_1.3.1 tibble_2.1.1
## [31] ggplot2_3.1.1 tidyverse_1.2.1
## [33] stringr_1.4.0 tidyr_0.8.3
## [35] readxl_1.3.1 kableExtra_1.1.0
## [37] knitr_1.22.8
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 class_7.3-14 snakecase_0.9.2
## [4] satellite_1.0.1 markdown_0.9 base64enc_0.1-3
## [7] rstudioapi_0.10 farver_1.1.0 ggrepel_0.8.0
## [10] lubridate_1.7.4 xml2_1.2.0 codetools_0.2-15
## [13] polyclip_1.10-0 spam_2.2-2 jsonlite_1.6
## [16] broom_0.5.2 gridBase_0.4-7 png_0.1-7
## [19] ggforce_0.2.1 shiny_1.3.1 DiagrammeR_1.0.0
## [22] compiler_3.5.2 httr_1.4.0 backports_1.1.4
## [25] Matrix_1.2-15 assertthat_0.2.1 lazyeval_0.2.2
## [28] cli_1.1.0 later_0.8.0 tweenr_1.0.1
## [31] prettyunits_1.0.2 visNetwork_2.0.6 htmltools_0.3.6
## [34] tools_3.5.2 dotCall64_1.0-0 gtable_0.3.0
## [37] glue_1.3.1 Rcpp_1.0.1 cellranger_1.1.0
## [40] rgexf_0.15.3 raster_2.8-19 nlme_3.1-137
## [43] crosstalk_1.0.0 xfun_0.6 rvest_0.3.3
## [46] mime_0.6 XML_3.98-1.19 MASS_7.3-51.1
## [49] scales_1.0.0 hms_0.4.2 promises_1.0.1
## [52] RColorBrewer_1.1-2 fields_9.6 yaml_2.2.0
## [55] gridExtra_2.3 downloader_0.4 stringi_1.4.3
## [58] highr_0.8 maptools_0.9-5 Rook_1.1-1
## [61] e1071_1.7-1 rlang_0.3.4 pkgconfig_2.0.2
## [64] evaluate_0.13 lattice_0.20-38 sf_0.7-3
## [67] labeling_0.3 tidyselect_0.2.5 plyr_1.8.4
## [70] magrittr_1.5 R6_2.4.0 generics_0.0.2
## [73] DBI_1.0.0 foreign_0.8-71 pillar_1.3.1
## [76] haven_2.1.0 withr_2.1.2 units_0.6-2
## [79] modelr_0.1.4 crayon_1.3.4 rmarkdown_1.12
## [82] progress_1.2.0 grid_3.5.2 data.table_1.12.2
## [85] influenceR_0.1.0 digest_0.6.18 classInt_0.3-1
## [88] webshot_0.5.1 xtable_1.8-3 httpuv_1.5.1
## [91] brew_1.0-6 stats4_3.5.2 munsell_0.5.0
5 References
Animals, A. (2019). Butterfly Families | Different Species and Family of Butterflies. Retrieved from https://animalcorner.co.uk/butterfly-families/
ButterflyCorner.net: Genus: ARGYNNIS. (2019). Retrieved from https://en.butterflycorner.net/Genus-ARGYNNIS.347.0.html
ButterflyCorner.net: Genus: VANESSA. (2019). Retrieved from https://en.butterflycorner.net/Genus-VANESSA.350.0.html
Family Lycaenidae (Gossamer-wing Butterflies) | Butterflies and Moths of North America. (2019). Retrieved from https://www.butterfliesandmoths.org/taxonomy/lycaenidae
Family Nymphalidae (Brush-footed Butterflies) | Butterflies and Moths of North America. (2019). Retrieved from https://www.butterfliesandmoths.org/taxonomy/Nymphalidae
Family Papilionidae (Parnassians and Swallowtails) | Butterflies and Moths of North America. (2019). Retrieved from https://www.butterfliesandmoths.org/taxonomy/Papilionidae
Kitching, R. (1999). Biology of Australian butterflies. Collingwood: CSIRO. Morpho Butterflies (Genus Morpho). (2019). Retrieved from https://www.inaturalist.org/taxa/68755-Morpho
Phylum - Definition and Examples | Biology Dictionary. (2019). Retrieved from https://biologydictionary.net/phylum/
Weather API - OpenWeatherMap. (2019). Retrieved from https://openweathermap.org/api